/*
** Data Cleaning and Preparation **

Because an importer can buy from several partners (exporters) in the same year, each importer-year is
	reduced to a single partner by asking: which exporter supplies the largest share of its import value?


This program will make a dataset of the form:

m xt xtm1 pt ptm1 ct ctm1


foreach num of local hs10list {
! mkdir raw_`num'
}
*/

* Session setup: never pause the output, then register the project directories.
* All three paths are relative to the directory Stata is run from.
set more off

global data    "../china/data"
global do      "../china/estimation/do"
global quality "../china/quality"

* HS10 product codes to process; the main loop below runs once per code.
* The codes are used as text tokens (note the leading zero on the last one) and are
* compared against the string variable hs. Code 7202700000 is commented out of the list.
local hs10listall 8521900000 8525209070 8520900080 8471500085 9405106010 7321116000 8528215501 8414513000 ///
8471604580 6403996040 9401790005 8525404000 8471801000 6403999031 4011201015 8708704545 8467210010 6403916040 ///
3925301000 8528127201 8415103040 6402999030 8472909080 8519990045 8467220070 6402999060 8414519090 /*7202700000*/ ///
3926201010 8509100080 6302319020 6110121060 9401308030 8516290030 8712003500 8516500090 8539310060 8708395030 ///
6111206010 8527136040 8527316040 3926201020 8418210010 3913902000 8516710020 6402991815 4412140540 9401790015 ///
6306229030 8471704065 8471300000 6403999065 6204699044 6307909889 6402991865 9405408000 0304203035


* Stack the 2005 and 2006 China import files into one working dataset.
use $data/imp2005china, clear
append using $data/imp2006china

* Records flagged rel==1 are excluded from the analysis.
drop if rel == 1

* The last three characters of the manufacturer id are used as the city code.
generate city = substr(manuf_id, -3, 3)

/* Assigns 3-letter cities to individual provinces: the included file is expected to
   define the -provinces- program that is run right after it. Rows that end up with an
   empty province are dropped. */
include $do/provinces.do
provinces
drop if province == ""

* Unit value (value over quantity) and its log.
generate p = v/q
generate log_price = log(p)

generate hs4 = substr(hs, 1, 4)

* Attach lambda from the quality estimates. keep(master match) discards rows that exist
* only in the quality file, i.e. the same rows the old "drop if _m==2" removed; the
* _merge variable itself stays in the saved file, as before.
merge m:1 hs manuf_id year using $quality/quality_est_final, keepusing(lambda) keep(master match)

save $data/all0506_hs10, replace


foreach num of local hs10listall {

* Output folder for this product. NOTE(review): nothing in this file creates it, so it is
* assumed to exist already (the mkdir loop in the header comment is commented out).
global raw "../china/estimation/hs10/biggest/raw_`num'"

* Load only this product's records that carry a lambda estimate.
use if hs=="`num'" & lambda!=. using $data/all0506_hs10, clear

* Snapshot the product subset, then continue from the snapshot.
save $data/test2, replace
use $data/test2, clear
set more off
/* Clean up unreasonable price values in two passes:
     1. drop unit values above the 99th or below the 1st percentile of p;
     2. drop anything still more than ten times the median of what is left.
   Observations with a missing p are removed by the first pass as well, because Stata
   treats a missing value as larger than any number.
   Removed relative to the earlier version, none of which affected the data:
     - a first _pctile call whose r() results were overwritten by the second call,
     - a median computed before trimming that was only used by a disabled rule
       (drop above a high percentile when that percentile exceeded 10x the median),
     - a duplicate log price (log_p) that no later step read; log_price is used instead. */
summ p, d

_pctile p, p(1 99)
local p_high=r(r2)
local p_low=r(r1)
drop if p>`p_high'
drop if p<`p_low'

* Median of the trimmed sample, used as the scale for the second pass.
egen price_med=median(p)
drop if p>10*price_med
drop price_med


* First and last year present for this product, stored as constant columns.
egen max_year=max(year)
egen min_year=min(year)

/* (1) and (1.5) Mean log price of every exporter (manuf_id) in the first year and in the
   last year. Each is written to its own one-row-per-exporter file (price_minyear and
   price_maxyear in the raw folder) and merged back on later; the last-year version
   helps with counterfactuals. */
foreach which in min max {
	preserve
	keep if year==`which'_year
	bysort manuf_id: egen price_`which'year=mean(log_price)
	* The mean is constant within exporter, so any single row per exporter will do.
	bysort manuf_id: keep if _n==1
	keep manuf_id price_`which'year
	save $raw/price_`which'year, replace
	restore
}





/* (2) Calculate the price change in each city between the first year and the other year.
   The result is saved as one row per city (citypre, citypost, citydiff) and merged back
   on later.
   This used to loop over every city and create five helper variables per city; the
   by-group form below produces the same three columns in a fixed number of passes and
   cannot run into the variable limit when a product has many cities. */
preserve

* Mean log price of each city in each year.
bysort city year: egen city_price=mean(log_price)

* Spread the first-year mean and the other-year mean to every row of the city.
* A city that is absent in one of the periods gets a missing value there.
bysort city: egen citypre=max(cond(year==min_year, city_price, .))
bysort city: egen citypost=max(cond(year!=min_year, city_price, .))
gen citydiff=citypost-citypre

* Rows with an empty city code never received a group number in the old per-city loop
* and therefore kept its initial zeros; reproduce that so the saved file is unchanged.
foreach v of varlist citypre citypost citydiff {
	replace `v'=0 if city==""
}

keep city citydiff citypre citypost
drop if citydiff==. /* Can be missing if a city is only found in one year. I have handchecked. */
duplicates drop city, force
save $raw/citydiff, replace
restore



/* Now that the preliminary objects exist, start winnowing down the sample:
	-Only use those importers (firmid) found in both the first and the last year.
   The first-year test now uses min_year instead of the literal 2005, matching the
   max_year test next to it and the min_year logic used everywhere else in this file.
   Results are identical whenever the product has 2005 records. */
bysort firmid: egen imp_first=max(year==min_year)
bysort firmid: egen imp_last=max(year==max_year)
keep if imp_first==1 & imp_last==1
drop imp_first imp_last

* Keep only the columns needed downstream. This also discards the _merge variable left
* over from the lambda merge, which the later merges rely on being absent.
* (An earlier version renamed lambda_imputed_hs4 to lambda at this point.)
order firmid year manuf_id v q log_price city province
keep firmid year manuf_id v q log_price city province min_year lambda
sort firmid manuf_id year

/* 3) Price-state boundaries.
   Writes N+1 boundary points to statepts.raw in the raw folder: the overall minimum log
   price, the N-1 interior quantile cut points of first-year log prices, and the overall
   maximum log price. */

* The number of price states is set here (it is set again further down, before xtile).
local N=5
local NP1 = `N'+1
local NM1 = `N'-1

* Overall range, taken over both years; these two columns stay in the data after restore.
egen minp=min(log_price)
egen maxp=max(log_price)

preserve
keep if year==min_year

* pctile stores the N-1 interior cut points in the first N-1 rows of statep.
* (An equally spaced grid via -range- was the earlier alternative.)
pctile statep=log_price, nquantiles(`N')

* Row 1 = minimum, rows 2..N = interior cut points shifted down one row, row N+1 = maximum.
gen statepts=cond(_n==1, minp, cond(_n==`NP1', maxp, statep[_n-1]))

* NOTE(review): this needs at least N+1 first-year observations, otherwise -keep in- fails.
keep in 1/`NP1'
outfile statepts using $raw/statepts.raw, replace
restore

****** The "majority" shortcut: reduce every importer-year to its largest exporter.

/* Attach the exporter and city price information first, so that the exporter picked as
   the majority partner is never one without price or city information.
   First-year exporter price: a match means the exporter has a first-year price, a
   master-only row means the exporter is seen in the later year only. Rows found only in
   the price file (their importer was filtered out above) are not kept. */
merge m:1 manuf_id using $raw/price_minyear, keep(master match) nogenerate

/* Last-year exporter price: a master-only row means the exporter is seen in the first
   year only. Rows found only in the price file are again not kept. */
merge m:1 manuf_id using $raw/price_maxyear, keep(master match) nogenerate

/* City price change: only matched rows survive, so records whose city lacks a price in
   one of the two periods are dropped here. */
merge m:1 city using $raw/citydiff, keep(match) nogenerate

* Share of each record in the importer's yearly import value; keep the largest one.
bysort firmid year: egen tot_v=total(v)
gen individual_v_share=v/tot_v
bysort firmid year: egen max_ind_v=max(individual_v_share)
gen max_v=(max_ind_v==individual_v_share)
keep if max_v==1

* An importer must be left with exactly two records ...
bysort firmid: keep if _N==2

* ... and they must come from different years: a pair inside the same year is removed.
bysort firmid year: drop if _N==2

** Labelling: integer codes for cities, importers and exporters, plus next-period values.
** Fix: price_maxyeart1 is now generated here. The t1 extract near the end of the loop
** keeps and renames that variable, but nothing created it, so -keep- stopped with a
** "variable not found" error.
** NOTE(review): it is built the same way as price_minyeart1 (the last-year price of the
** exporter chosen in the later year) - confirm this is the intended definition.

rename log_price pt

* City codes 1..C.
egen ct=group(city)
cap drop city_total
egen city_total=max(ct)
local C=city_total

* Importer codes 1..M.
egen m=group(firmid)
egen importers=max(m)
local M=importers

* Exporter codes 1..X.
egen xt=group(manuf_id)
egen exporters=max(xt)
local X=exporters

* After the filters above every importer has one first-year row followed by one later
* row, so [_n+1] on a first-year row reads the same importer's later-year record.
sort firmid year
gen ct1=ct[_n+1] if year==min_year
gen xt1=xt[_n+1] if year==min_year
gen pt1=pt[_n+1] if year==min_year
gen price_minyeart1=price_minyear[_n+1] if year==min_year /* This is necessary b/c the surviving price_minyear is that of the old exporter not the new */
gen price_maxyeart1=price_maxyear[_n+1] if year==min_year /* Same idea: last-year price of the new exporter */

gen lambdat1=lambda[_n+1] if year==min_year

* From here on there is one row per importer (the first-year row carrying the t1 leads).
drop if year!=min_year

* 1 when the importer's majority exporter is the same in both periods.
gen stayed=(xt1==xt)

* Rank importers by total import value (field ranking: the largest gets rank 1) and keep
* the top 50.
* NOTE(review): the year test uses the literal 2005, while only min_year rows remain here.
egen rank_v=rank(tot_v), field
sort firmid year
keep if rank_v<=50 | year!=2005

* Expected next-period log price for stayers: current price plus the price change of
* their city. Switchers are filled in further down.
gen exp_price=pt+citydiff if stayed==1


* Build a lookup of city price information keyed by city code, renamed so that it can be
* attached to the destination city (ct1) of every importer.
preserve
keep ct citydiff citypre citypost
* The three city values are constant within a city code, so one row per code is enough.
bysort ct: keep if _n==1
foreach s in diff pre post {
	rename city`s' newcity`s'
}
rename ct ct1
save $raw/newcitydiff, replace
restore

* Only matched rows are kept:
*  - lookup-only rows mean that, in the end, no one moved to that city;
*  - master-only rows mean the destination city has no first-period row left in the data
*    from which to take a difference.
merge m:1 ct1 using $raw/newcitydiff, keep(match) nogenerate

/* Fill-ins and the expected price for importers that switched exporter.
   This used to loop over every city code and every exporter code, creating two helper
   variables per exporter (switchp# and p#) and running X*C replace passes. The
   vectorised form below assigns the same values to price_minyeart1, price_maxyear and
   exp_price; the helper variables are no longer left behind in the saved data.
   Uses the locals C and X set in the labelling step above. */

* New exporter without a first-year price of its own: fall back on the first-year mean
* price of its city.
replace price_minyeart1=newcitypre if price_minyeart1==. & inrange(ct1,1,`C')

/* Because of the difference between min and max year, this is not t1. We need to replace
   the ones with missing price_maxyear, as they won't be appended on after like the other
   variables. */
replace price_maxyear=newcitypost if price_maxyear==. & inrange(ct1,1,`C')

* Lookup: first-year price of every exporter that still appears as a first-year partner
* (xt) in the current data, keyed so that it can be merged on the new exporter xt1.
preserve
keep xt price_minyear
drop if price_minyear==.
bysort xt (price_minyear): keep if _n==_N
rename xt xt1
rename price_minyear p_switch
tempfile xtprice
save `xtprice'
restore

merge m:1 xt1 using `xtprice', keep(master match) nogenerate

* No first-year price known for the new exporter: use its city's first-year mean instead.
replace p_switch=newcitypre if p_switch==. & inrange(ct1,1,`C')

* Expected price for switchers: the new exporter's first-year price plus the price change
* of the new exporter's city.
replace exp_price=p_switch+newcitydiff if stayed==0 & inrange(xt1,1,`X')
drop p_switch

* w = realised next-period log price minus the expected one.
gen w=pt1-exp_price

/* w is missing only where exp_price (or pt1) is missing, i.e. for importers that
	-had no information about the exporter before going there (no pre-price), and
	-had no information about the city before going there (no pre-city price).
   The original note left the counts open ("XX out of XX observations are missing");
   these rows are dropped at this point. */
keep if w!=.

save $raw/final, replace

** Standardize the exporter codes (stem x) and the city codes (stem c) so that they run
** from 1 to the number of distinct codes found in either period.
** For each stem this writes, in the raw folder:
**   xt1 / ct1    - the t1 codes, renamed to the period-t name
**   xxt / cct    - crosswalk from the old period-t code to the new consecutive code
**   xxt1 / cct1  - the same crosswalk with both columns renamed for the t1 variables
foreach s in x c {
	* t1 codes under the period-t name, so that they can be stacked below.
	use $raw/final, clear
	keep `s't1
	rename `s't1 `s't
	save $raw/`s't1, replace

	* Union of period-t and t1 codes, one row each, numbered consecutively.
	use $raw/final, clear
	keep `s't
	append using $raw/`s't1
	duplicates drop `s't, force
	egen `s'`s't=group(`s't)
	save $raw/`s'`s't, replace

	* Same crosswalk, keyed for the t1 variables.
	rename `s't `s't1
	rename `s'`s't `s'`s't1
	save $raw/`s'`s't1, replace
}


* Attach the standardized codes to the final data, in the order xt, xt1, ct, ct1.
* Crosswalk rows without a counterpart in the data are not kept and no _merge is left.
use $raw/final, clear
foreach s in x c {
	foreach lag in t t1 {
		merge m:1 `s'`lag' using $raw/`s'`s'`lag', keep(master match) nogenerate
	}
}





* Purpose of this section: write w.raw with at least one row for every exporter code
* between 1 and the largest code, using placeholder rows with w=0 where no importer
* chose that exporter at t1.

* For the case when there is no xxt=1
* Append one empty observation with xxt1=1 and flag it with fakeone.
local o=_N
local p=`o'+1
set obs `p'
replace xxt1=1 in `p'
gen fakeone=0
replace fakeone=1 in `p'
sort xxt1
* The placeholder is only needed when no real row has xxt1==1; otherwise remove it.
bysort xxt1: gen nobs=_N
drop if fakeone==1 & nobs>1
drop nobs

* For the case when there is no xxt=MAX
* Same idea for the largest code: append one empty observation and flag it with fakemax.
local o=_N
local p=`o'+1
set obs `p'

* xmax is the largest standardized exporter code (taken over xxt).
* The local k is assigned but not used in the code visible here.
egen xmax=max(xxt)
local k=xmax

replace xxt1=xmax in `p'
gen fakemax=0
replace fakemax=1 in `p'
sort xxt1
* Remove the placeholder again when a real row already has the largest code.
bysort xxt1: gen nobs=_N
drop if fakemax==1 & nobs>1
drop nobs xmax




* Fill the gaps between consecutive codes: xxt1_diff is the distance to the next code,
* and expand leaves that many copies in total, marking the added copies with fake=1.
sort xxt1
gen xxt1_diff=xxt1[_n+1]-xxt1
expand xxt1_diff, gen(fake)
sort xxt1 fake
* Number the copies 1,2,... within a code and shift them onto the missing codes.
bysort xxt1 fake: gen obs=_n
replace xxt1=xxt1+obs if fake==1
* The two boundary placeholders count as fake rows as well.
replace fake=1 if fakeone==1
replace fake=1 if fakemax==1
* Every fake row gets w=0.
replace w=0 if fake==1
sort xxt1
outfile xxt1 w using $raw/w.raw, replace

* Back to the real observations only.
drop if fake==1
drop if xxt==.

* Number of price states; it must agree with the value used for the state boundaries.
local N=5
local NP1 = `N'+1
local NM1 = `N'-1

* Price state of each importer: quantile group of its first-period log price.
xtile imppricestate=pt, nquantiles(`N')

* Share of each importer's record in the total value of the remaining sample.
egen ind_v=total(v)
gen firmshare=v/ind_v

* Both raw files are written in the same row order, sorted by first-period exporter code.
sort xxt
outfile imppricestate xxt xxt1 using $raw/imp.raw, replace
outfile firmshare using $raw/firmshare.raw, replace

save $raw/final2, replace

* Build the t1 view: the new exporter's code, prices, city code, city price change and
* lambda, renamed to the period-t names so that it can be stacked under final2.
* NOTE(review): this expects price_maxyeart1 to exist in the data at this point.
keep xxt1 price_minyeart1 price_maxyeart1 cct1 newcitydiff lambdat1
foreach v in price_minyear price_maxyear lambda {
	rename `v't1 `v'
}
rename xxt1 xxt
rename cct1 cct
rename newcitydiff citydiff
save $raw/t1, replace

* Stack the period-t rows and the t1 rows, then keep one row per exporter code.
* NOTE(review): which row survives among duplicates depends on the sort order of ties.
use $raw/final2, clear
append using $raw/t1

sort xxt
duplicates drop xxt, force

* Exporter-level files, one line per exporter code.
foreach v in price_minyear price_maxyear {
	outfile `v' using $raw/`v'.raw, replace
}
outfile cct lambda using $raw/cities.raw, replace

* City-level file, one line per city code.
sort cct
duplicates drop cct, force
outfile citydiff using $raw/citydiff.raw, replace

}


* For every product, record how many distinct exporter codes (period t or t1) ended up in
* the final data, and save that count together with the product code as nobs in the
* product's raw folder.
* Fix: the loop used to iterate over the local hs10list, which is not defined anywhere in
* this file, so its body never ran; it now uses hs10listall like the main loop.
* NOTE(review): if a subset of products was intended, define that list explicitly.
foreach num of local hs10listall {
global raw "../china/estimation/hs10/biggest/raw_`num'"
use $raw/final2, clear
append using $raw/t1
sort xxt
duplicates drop xxt, force
* Every remaining row carries the same count and product code.
gen nobs=_N
gen hs="`num'"
keep nobs hs
save $raw/nobs, replace
}

